clear
capture restore
set scheme cleanplots, perm

* SET DIRECTORIES
global home "..."

global data "$home/Data/"
global output "$home/Output/"

cd $home

********************************************************************************
********************************************************************************
********************************************************************************
* 	Read-in S&P 500 Constituents Data and Clean
*
* 	This section creates a dataset with the dates in which a firm was listed in 
* 	S&P 500 index.
********************************************************************************
********************************************************************************
********************************************************************************
*Read-in raw S&P 500 constituents data
use "${data}sp500_constituents", clear

*Drop unnecessary variables
drop tic conm gvkeyx

rename co_cusip cusip

*Drop firms in S&P 500 before our sample period
drop if year(thru)<1995

*Need to impute date for firms currently in S&P500
*Note: The S&P 500 constituents file is no longer avaiable on Compustat as of
* 		4/2020, so S&P 500 firms are frozen from that point through end of 2020
replace thru=mdy(12,31,2020) if thru==.

*34 firms left the S&P 500, but then re-entered at a later date.
*Need to collapse into one observation
*Number the observations within a firm to check for duplicates
bysort gvkey cusip: gen seq = _n

*Tag one observation per unique firm/security couple
egen firm_sec_tag = tag(gvkey cusip)

*Find the firm's first entry date and second entry date (if applicable)
bysort gvkey cusip: egen from1 = min(from)
bysort gvkey cusip: egen from2 = max(from)

replace from2 = . if from2==from1

*Find the firm's first exit date and second exit date (if applicable)
bysort gvkey cusip: egen thru1 = min(thru)
bysort gvkey cusip: egen thru2 = max(thru)
replace thru2 = . if thru2==thru1

format from1 from2 thru1 thru2 %td

*Drop firm's duplicate observation
keep if firm_sec_tag==1

*Drop unnecessary variables
drop from thru seq firm_sec_tag

label var from1 "First Day CUSIP Listed in S&P 500 Index"
label var thru1 "Last Day CUSIP Listed in S&P 500 Index"
label var from2 "First Day CUSIP Listed in S&P 500 Index, 2nd time"
label var thru2 "Last Day CUSIP Listed in S&P 500 Index, 2nd time"

*Create 8-digit CUSIP consistent with ivol dataset
gen cusip_8 = substr(cusip,1,8)

sort cusip from1

save "${data}sp500_constituents_final", replace

********************************************************************************
********************************************************************************	  
********************************************************************************
*   Read-in Quarterly Compustat Data and Clean
*
* 	This section creates our main quarterly Compustat dataset, merging on the 
* 	data created in the above section. 
********************************************************************************
********************************************************************************
********************************************************************************
use "${data}Compustat_toMatch", clear

*Destring datacqtr to get year and quarter*
gen year_str = substr(datacqtr,1,4)
gen qtr_str  = substr(datacqtr,6,1)
destring year_str, gen(year)
destring qtr_str, gen(qtr)
drop year_str qtr_str datacqtr

****************************
***Merge on S&P 500 dates***
****************************
merge m:1 gvkey cusip using "${data}sp500_constituents_final"

*CUSIP on S&P 500 file is not unique to GVKEY, while it is in Compustat data
*So, need to collapse securities-level S&P dates to firm-level Compustat data
gen from3=.
gen thru3=.
*1 firm has two from/thru dates for each of its 2 CUSIPs
gen from4=.
gen thru4=.

forvalues i=1/2 {
	
	*Record S&P 500 entry/exit dates for firm securities that matched Compustat
	bysort gvkey _merge: egen from`i'_min=min(from`i') if _merge==3
	bysort gvkey _merge: egen thru`i'_min=min(thru`i') if _merge==3
	
	local j=`i'+2
	
	*Record S&P 500 entry/exit dates for firm securities that didn't match Compustat
	bysort gvkey _merge: egen from`j'_min=min(from`i') if _merge==2
	bysort gvkey _merge: egen thru`j'_min=min(thru`i') if _merge==2

}

*Record the from/thru information for firm's second CUSIP on all firm's observations
*Then we can drop the duplicate CUSIP observation below.
forvalues i=3/4 {

	bysort gvkey: egen from`i'_temp = max(from`i'_min)
	bysort gvkey: egen thru`i'_temp = max(thru`i'_min)
	
	replace from`i'= from`i'_temp
	replace thru`i'= thru`i'_temp

}

format from1_min from3_min from2_min from4_min from1 from2 from3 from4 %td
format thru1_min thru3_min thru2_min thru4_min thru1 thru2 thru3 thru4 %td

drop from3_temp-thru4_temp

*Now drop GVKEY/CUSIP pairs that did not match Compustat
drop if _merge==2
drop _merge

*Create dates to flag firms that were in S&P 500 during a quarter
gen 	min_date=mdy(1,1,year) if qtr==1
replace min_date=mdy(4,1,year) if qtr==2
replace min_date=mdy(7,1,year) if qtr==3
replace min_date=mdy(10,1,year) if qtr==4

gen 	max_date=mdy(3,31,year) if qtr==1
replace max_date=mdy(6,30,year) if qtr==2
replace max_date=mdy(9,30,year) if qtr==3
replace max_date=mdy(12,31,year) if qtr==4

format min_date max_date %td

*Flag firm if in S&P500 at any point during quarter, 
*i.e. in S&P before end of quarter and didn't exit before quarter started
gen in_SP500flg_qtr=(max_date>=from1 & min_date<=thru1)|(max_date>=from2 & min_date<=thru2)|(max_date>=from3 & min_date<=thru3)|(max_date>=from4 & min_date<=thru4)

label var in_SP500flg_qtr "Firm in S&P 500 during quarter"

*******************
*Merge on annual Taylor and Peter's Q data
gen fyear = fyearq
merge m:1 gvkey fyear using "${data}totalQ.dta", keepusing(gvkey fyear K_int K_int_offBS K_int_Know K_int_Org q_tot)
drop if _merge==2
drop _merge
drop fyear

*Winsorize q_tot at 1% tails
sum q_tot, detail
gen 	q_tot_wins = q_tot
replace q_tot_wins = r(p1) if q_tot<r(p1)
replace q_tot_wins = r(p99) if q_tot > r(p99) & !missing(q_tot)

*******************
***DATA CLEANING***
*******************
*Drop unnecessary variables
drop indfmt consol popsrc datafmt tic conm curcdq datafqtr fyearq exchg cik costat incorp from1-thru4_min datadate

order gvkey year qtr in_SP500flg_qtr  

*Create timeseries variables
*Create numeric firm id
destring gvkey, gen(firmid)
*Create quarterly time variable
gen ts_qtr = tq(1988q1) + (year-1988)*4 + (qtr - 1)
format ts_qtr %tq

*Drop exact duplicates.
egen date_tag = tag(firmid ts_qtr)
drop if date_tag==0
drop date_tag

*Check S&P 500 flag (should be approximately 500 firms per quarter)
tab ts_qtr in_SP500flg_qtr

destring sic, replace

*Only keep firms incorporated in the U.S.
keep if fic=="USA"

*Drop firm-quarters with acquisitions greater than 5% of assets
*NOTE: aqcy (acquisition value) is a year-to-date variable by fiscal quarter, not quarterly
*Create quarterly acquisition value variable
xtset firmid ts_qtr 

gen 	aqcq = 0
replace aqcq = aqcy 		 if fqtr==1 & !missing(aqcy)
replace aqcq = aqcy - L.aqcy if fqtr!=1 & !missing(L.aqcy) & !missing(aqcy)

gen 	aqcq_to_assets = aqcq / atq

drop if aqcq_to_assets>0.05 & !missing(atq)

*Drop 901 observations where assets and/or liabilities equal zero or negative
drop if atq<=0|ltq<=0

*Drop observations with missing accounting variables
drop if missing(atq)|missing(ltq)|missing(seqq)

*Drop if firm violates accounting identity by more than 10% of book value of assets
gen at_min_lt_min_seq = atq - ltq - seqq
gen acct_error_pct = at_min_lt_min_seq / atq
sum acct_error_pct, detail
gen flag = (acct_error_pct > .1 | acct_error_pct < -.1)
tab flag, missing
*Drop 14,887 observations that violate accounting identify 
drop if flag
drop at_min_lt_min_seq acct_error_pct flag

*Merge on quarterly MP shock data
*Note: 2019 Q3 and Q4 don't have MP shock data available yet
merge m:1 year qtr using "${data}MP_shocks_quarterly"
drop if _merge==2
drop _merge

merge m:1 year qtr using "${data}MP_shocks_daily_quarterly"
drop if _merge==2
drop _merge

drop pre_crisis post_crisis crisis_flg

*Create time period flags
gen 	pre_crisis = (ts_qtr<=193)

gen 	post_crisis = (ts_qtr>=198)

gen 	crisis_flg = (ts_qtr>193 & ts_qtr<198)

label var pre_crisis "1=July 1991 - June 2008"
label var post_crisis "1=July 2009 - June 2019"
label var crisis_flg "1=financial crisis date (July2008-June2009)"

order gvkey firmid year qtr in_SP500flg_qtr pre_crisis post_crisis crisis_flg

*Merge on firm age
merge m:1 gvkey year using "${data}firm_age_data"
drop if _merge==2
drop _merge

*Create top-coded version of age variable
gen 	age_capped = age
replace age_capped = 48 if age>48 & !missing(age)

label var age_capped "Age, top-coded at 48yrs"

*Set stockholders equity to zero if negative (as in Hadlock & Pierce) to give D2C=1
replace seqq=0 if seqq<0

replace cheq=0 if cheq<0

*******************************
***CREATE LEVERAGE VARIABLES***
*******************************
gen lev_d2a = (dlcq + dlttq) / atq
label var lev_d2a "Leverage, debt to assets, quarterly"

gen lev_d2c = (dlcq + dlttq)/(seqq+dlcq+dlttq)
label var lev_d2c "Leverage, debt to capital, quarterly"

gen lev_d2e = (dlcq + dlttq)/(seqq)
label var lev_d2e "Leverage, debt to equity, quarterly"

*Create share of Total Debt by LT and ST
gen lt_debt_share = dlttq / (dlttq + dlcq)
gen st_debt_share = dlcq / (dlttq + dlcq)

xtset firmid ts_qtr

*Merge on Price Index data to create real sales growth
*Table 1.3.4. Price Indexes for Gross Value Added by Sector (Non-Farm Business Index)
merge m:1 year qtr using "${data}bea_nipa_price_index"
*Drop dates outside of sample range
drop if _merge==1
drop if _merge==2
drop _merge

*Create tangible assets
gen tanq = atq - intanq

label var tanq "Tangible Assets"

replace xrdq=0 if missing(xrdq)

gen K_int_onBS = K_int - K_int_offBS

***Adjust capital stock variables by price level
replace ppentq=ppentq/(price_index/100)
replace ppegtq=ppegtq/(price_index/100)

***Deflate select variables
gen niq_real=niq/(price_index/100)
gen ivltq_real=ivltq/(price_index/100)
gen ivstq_real=ivstq/(price_index/100)
gen invtq_real=invtq/(price_index/100)
gen intanq_real=intanq/(price_index/100)
gen tanq_real=tanq/(price_index/100)
gen xrdq_real=xrdq/(price_index/100)
gen saleq_real=saleq/(price_index/100)
gen K_int_real=K_int/(price_index/100)
gen K_int_offBS_real=K_int_offBS/(price_index/100)
gen K_int_onBS_real=K_int_onBS/(price_index/100)
gen K_int_Know_real=K_int_Know/(price_index/100)
gen K_int_Org_real=K_int_Org/(price_index/100)

*********************
***Create Controls***
*********************
xtset firmid ts_qtr

*Real Log Sales Growth
gen log_salesg_real = (ln(saleq / (price_index/100)) - ln(L4.saleq / (L4.price_index/100))) * 100
label var log_salesg_real "Log Sales Growth, YoY, deflated by price_index"

*Size (log assets)
gen size_assets = ln(atq/(price_index/100))
label var size_assets "log of total assets, quarterly"

*Current Assets as a Share of Total Assets
gen current_total_orig = actq / atq
gen current_total = 1 - (actq / atq)
label var current_total_orig "Current Assets as a Share of Total Assets"
label var current_total "1 - Current Assets as a Share of Total Assets"

gen liquidity_neg 		= 1 - (cheq / atq)
gen liquidity 			= cheq / atq

label var fqtr "Fiscal Quarter"

*Price-to-Cost margin
gen pcmq = (saleq - cogsq) / saleq
label var pcmq 	"Price-to-Cost Margin, quarterly"

*Receivables minus Payables to Sales
gen recpay2yq = (rectq - apq) / saleq
label var recpay2yq "Receivables minus Payables to Sales, quarterly"

*Dpreciation to Assets
gen d2aq = dpq / atq
label var d2aq 				"Depreciation to Assets, quarterly"

*Cash 2 Sales
gen cash2sales = (saleq - cond(missing(cogsq),0,cogsq) - cond(missing(xsgaq),0,xsgaq)) / saleq
label var cash2sales "Sales minus Cost of Goods Sold minus Selling Expenses over Sales, qtrly"

*Quarterly market_cap variable
*Note: Shares Outstanding is missing pre-1998 from daily Compustat/CRSP share data. 
*So use the quarterly market capitalization calculated from Compustat/CRSP dataset.
gen market_cap_q = (cshoq * prccq * 100) / price_index
gen ln_market_cap_q = ln(market_cap_q)

*Create Tobin's q variable
gen tobins_q = (atq + (prccq * cshoq) - ceqq - txditcq) / atq

label var tobins_q "Tobin's Q, quarterly"

*Winsorize tobins_q at 1% tails
sum tobins_q, detail
gen 	tobins_q_wins = tobins_q
replace tobins_q_wins = r(p1) if tobins_q<r(p1)
replace tobins_q_wins = r(p99) if tobins_q > r(p99) & !missing(tobins_q)

merge m:1 firmid qtr year using "${data}distance2default_qtrly"
drop if _merge==2
drop _merge

*Full Period standardization
sum edf
gen edf_std = (edf - r(mean)) / r(sd)

label var edf "Expected Default Frequency"

gen dd = -invnormal(edf)

label var dd "Distance to Default"

xtset firmid ts_qtr
***Standardize quarterly Compustat variables
foreach i in log_salesg_real size_assets current_total_orig current_total liquidity_neg liquidity pcmq recpay2yq d2aq cash2sales  ln_market_cap_q tobins_q lev_d2a lev_d2c lev_d2e lt_debt_share st_debt_share cshoq dd {

	gen `i'_1lag = L.`i'
	
	*Winsorize at 1% tails
	sum `i'_1lag, detail
	replace `i'_1lag = r(p1) if `i'_1lag < r(p1)
	replace `i'_1lag = r(p99) if `i'_1lag > r(p99) & !missing(`i'_1lag)
	
	*Full Period standardization
	sum `i'_1lag
	gen `i'_1lag_std = (`i'_1lag - r(mean)) / r(sd)
	
}

*Drop variables
drop pcmq-cash2sales ln_market_cap_q lev_d2a-st_debt_share

*Create industry variables
gen sic_2dig = floor(sic/100)

gen sic_agric 		 = 0 if sic_2dig!=.
gen sic_mining 		 = 0 if sic_2dig!=.
gen sic_construction = 0 if sic_2dig!=.
gen sic_manuf 		 = 0 if sic_2dig!=.
gen sic_transport 	 = 0 if sic_2dig!=.
gen sic_wholesale 	 = 0 if sic_2dig!=.
gen sic_retail 		 = 0 if sic_2dig!=.
gen sic_fire 	 	 = 0 if sic_2dig!=.
gen sic_services 	 = 0 if sic_2dig!=.

replace sic_agric 		 = 1 if sic_2dig<10
replace sic_mining 		 = 1 if sic_2dig>=10 & sic_2dig<=14
replace sic_construction = 1 if sic_2dig>=15 & sic_2dig<=17
replace sic_manuf 		 = 1 if sic_2dig>=20 & sic_2dig<=39
replace sic_transport 	 = 1 if sic_2dig>=40 & sic_2dig<=49
replace sic_wholesale 	 = 1 if sic_2dig>=50 & sic_2dig<=51
replace sic_retail 		 = 1 if sic_2dig>=52 & sic_2dig<=59
replace sic_fire 		 = 1 if sic_2dig>=60 & sic_2dig<=67
replace sic_services 	 = 1 if sic_2dig>=70 & sic_2dig<=89

gen 	sector = 1 if sic_agric==1
replace sector = 2 if sic_mining==1
replace sector = 3 if sic_construction==1
replace sector = 4 if sic_manuf==1
replace sector = 5 if sic_transport==1
replace sector = 6 if sic_wholesale==1
replace sector = 7 if sic_retail==1
replace sector = 8 if sic_services==1
replace sector = 9 if sic_fire==1

*Create value label for sector variable
label define sector_lbl 1 "Agriculture" 2 "Mining" 3 "Construction" 4 "Manufacturing" 5 "Transportation" 6 "Wholesale" 7 "Retail" 8 "Services" 9 "FIRE"
label values sector sector_lbl

*Create a qtr x sector FE, as in Ottonello & Winberry
egen qtr_industry = group(ts_qtr sector)

*Create quarterly Compustat variables from year-to-date versions
gen 	cdvcq = cdvcy if fqtr==1
replace cdvcq = cdvcy - L.cdvcy if fqtr>1

gen 	ibq = iby if fqtr==1
replace ibq = iby - L.iby if fqtr>1

gen 	pdvcq = pdvcy if fqtr==1
replace pdvcq = pdvcy - L.pdvcy if fqtr>1

*Drop dates outside to our sample period
drop if year<1995 | year>2020

sort gvkey year qtr

label var lev_d2c_1lag "Leverage, debt to capital"
label var lev_d2a_1lag "Leverage, debt to assets"
label var lev_d2e_1lag "Leverage, debt to equity"
label var log_salesg_real_1lag "Log Real Sales Growth, YoY"
label var size_assets_1lag "Log of Total Assets"
label var current_total_1lag "1 - Current Assets as Share of Total Assets"
label var current_total_orig_1lag "Current Assets as Share of Total Assets"
label var liquidity_1lag "Cash as Share of Total Assets"
label var liquidity_neg_1lag "1 - Cash as Share of Total Assets"
label var pcmq_1lag "Price-to-Cost Margin"
label var recpay2yq_1lag "Receivables minus Payables to Sales"
label var d2aq_1lag "Depreciation to Assets"
label var cash2sales_1lag "Sales minus Cost of Goods Sold minus Selling Expenses over Sales"
label var ln_market_cap_q_1lag "Log of Market Cap"
label var tobins_q_1lag "Tobin's q, winsorized at 1% tails"


**************************************************
*Create investment variables*
**************************************************
xtset firmid ts_qtr 
*Create Capital Stock variable following Ottonello & Winberry methodology
*First, need to flag the first date that each firm reports their capital stock
gen 	capital_flag = 0
replace capital_flag = 1 if (ppegtq!=. & L.ppentq!=. & F.ppentq!=.) | (ppegtq!=. & ppentq!=. & F.ppentq!=.)

bysort firmid capital_flag: egen capital_date_temp = min(ts_qtr) if capital_flag==1
bysort firmid: egen capital_date = max(capital_date_temp)
format capital_date %tq

*Next, need to interpolate any missing net investment values between two non-missing values
sort firmid ts_qtr
gen 	ppentq_fill = ppentq
replace ppentq_fill = (L.ppentq + F.ppentq)/2 if L.ppentq!=. & F.ppentq!=. & ppentq==.

gen 	ppegtq_fill = ppegtq
replace ppegtq_fill = (L.ppegtq + F.ppegtq)/2 if L.ppegtq!=. & F.ppegtq!=. & ppegtq==.

*Initialize capital stock value with first date that records the total capital 
*Then update capital stock with change in net capital stock
*If more than 2 consecutive quarters don't report net capital stock, then start
*again with gross capital stock value
gen 	capital_stock = ppegtq if ts_qtr==capital_date
replace capital_stock = L.capital_stock + D.ppentq_fill if ts_qtr!=capital_date
replace capital_stock = ppegtq if capital_stock==.
replace capital_stock = L.capital_stock + D.ppentq_fill if ts_qtr!=capital_date & L.capital_stock!=.

label var capital_stock "Capital Stock, quarterly"
drop capital_flag capital_date_temp capital_date 

*Create investment variable as percentage change in capital stock
gen investment_intensive = (log(capital_stock) - log(L.capital_stock)) * 100

label var investment_intensive "Investment, Intensive Margin"

*Assume a 10% depreciation rate as average
gen investment_extensive = ((D.capital_stock/L.capital_stock)>0.10)
replace investment_extensive = . if missing(D.capital_stock)

label var investment_extensive "Investment, Extensive Margin"


*Drop variables
drop fyear fyr seqq-fic  min_date max_date aqcq aqcq_to_assets

*Order variables
order gvkey firmid cusip year qtr in_SP500flg_qtr pre_crisis post_crisis crisis_flg capital_stock investment_intensive

*Create 8-digit CUSIP consistent with ivol dataset
gen cusip_8 = substr(cusip,1,8)

gen K_tot = capital_stock + K_int_real

*merge on P&T quarterly variables
merge 1:1 gvkey year qtr using "${data}PT_quarterly"
drop if _merge==2
drop _merge

destring gvkey, gen(gvkey_num)
rename gvkey gvkey_str
rename gvkey_num gvkey

merge 1:1 gvkey_str year qtr using "${data}qtrly_ivol_data"
drop if _merge==2
drop _merge

merge m:1 year qtr using "${data}agg_econ_vars"
drop if _merge==2
drop _merge

merge m:1 year qtr using "${data}VIX_EOQ"
drop if _merge==2
drop _merge

sort gvkey year qtr

*Save .dta file 
save "${data}Compustat_Quarterly_Final", replace

********************************************************************************
********************************************************************************
********************************************************************************
********************************************************************************

